{
 "cells": [
  {
   "cell_type": "code",
   "id": "initial_id",
   "metadata": {
    "collapsed": true,
    "ExecuteTime": {
     "end_time": "2025-05-20T04:22:58.285952Z",
     "start_time": "2025-05-20T04:22:58.283607Z"
    }
   },
   "source": "from transformers import AutoTokenizer",
   "outputs": [],
   "execution_count": 38
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-05-20T04:22:58.770046Z",
     "start_time": "2025-05-20T04:22:58.309441Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# Load the tokenizer that belongs to the hfl/chinese-macbert-base checkpoint.\n",
    "checkpoint = \"hfl/chinese-macbert-base\"\n",
    "tokenizer = AutoTokenizer.from_pretrained(checkpoint)"
   ],
   "id": "c14ae15342060976",
   "outputs": [],
   "execution_count": 39
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-05-20T04:22:58.780670Z",
     "start_time": "2025-05-20T04:22:58.777229Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# Sample sentence mixing Chinese characters with an English word.\n",
    "sen = \"这个地方是个interesting\"\n",
    "\n",
    "# Encode the same sentence three ways in a single call:\n",
    "#   text / text_pair -> one paired input ([CLS] text [SEP] text_pair [SEP])\n",
    "#   text_target      -> a separate encoding returned under 'labels'\n",
    "# return_offsets_mapping=True adds the (start, end) character span of each token.\n",
    "rt = tokenizer(\n",
    "    text=sen,\n",
    "    text_pair=sen,\n",
    "    text_target=sen,\n",
    "    return_offsets_mapping=True,\n",
    ")\n",
    "rt"
   ],
   "id": "50313a27887e7a3",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'input_ids': [101, 6821, 702, 1765, 3175, 3221, 702, 10673, 12865, 12921, 8181, 102, 6821, 702, 1765, 3175, 3221, 702, 10673, 12865, 12921, 8181, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'offset_mapping': [(0, 0), (0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 9), (9, 12), (12, 16), (16, 17), (0, 0), (0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 9), (9, 12), (12, 16), (16, 17), (0, 0)], 'labels': [101, 6821, 702, 1765, 3175, 3221, 702, 10673, 12865, 12921, 8181, 102]}"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 40
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-05-20T04:22:58.833449Z",
     "start_time": "2025-05-20T04:22:58.830241Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# Word index behind every token of the first (and only) encoded example.\n",
    "# Special tokens map to None; sub-word pieces of one word share an index.\n",
    "rt.word_ids(batch_index=0)"
   ],
   "id": "16319b9c3aeace79",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[None, 0, 1, 2, 3, 4, 5, 6, 6, 6, 6, None, 0, 1, 2, 3, 4, 5, 6, 6, 6, 6, None]"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 41
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-05-20T04:22:58.875439Z",
     "start_time": "2025-05-20T04:22:58.871979Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# Which input sequence each token came from (0 = text, 1 = text_pair);\n",
    "# special tokens are reported as None.\n",
    "rt.sequence_ids(batch_index=0)"
   ],
   "id": "23602e2cc7b459d2",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, None]"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 42
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-05-20T04:22:58.910693Z",
     "start_time": "2025-05-20T04:22:58.907758Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# Segment ids stored in the encoding; here the special tokens carry 0 or 1\n",
    "# as well instead of None.\n",
    "token_type_ids = rt[\"token_type_ids\"]\n",
    "token_type_ids"
   ],
   "id": "73f667116babe730",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 43
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-05-20T04:22:58.946768Z",
     "start_time": "2025-05-20T04:22:58.943589Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# Decode each id on its own so every individual piece (including the\n",
    "# \"##\" continuation pieces and the special tokens) is printed on its own line.\n",
    "for token_id in rt[\"input_ids\"]:\n",
    "    piece = tokenizer.decode(token_id)\n",
    "    print(piece)"
   ],
   "id": "e515917b22059908",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[CLS]\n",
      "这\n",
      "个\n",
      "地\n",
      "方\n",
      "是\n",
      "个\n",
      "int\n",
      "##ere\n",
      "##stin\n",
      "##g\n",
      "[SEP]\n",
      "这\n",
      "个\n",
      "地\n",
      "方\n",
      "是\n",
      "个\n",
      "int\n",
      "##ere\n",
      "##stin\n",
      "##g\n",
      "[SEP]\n"
     ]
    }
   ],
   "execution_count": 44
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-05-20T04:22:59.002628Z",
     "start_time": "2025-05-20T04:22:59.000726Z"
    }
   },
   "cell_type": "code",
   "source": "",
   "id": "201d9d839da03bf3",
   "outputs": [],
   "execution_count": null
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
     "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
